R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Cleaning

Semester Data

library("tidyverse")

##load the dataset
semester_data_1 <- read_csv("raw/semester_dummy/semester_data_1.csv")
semester_data_2 <- read_csv("raw/semester_dummy/semester_data_2.csv")
head(semester_data_1)
head(semester_data_2)
##rename the column names in "semester_data_1"
semester_data_1 <- semester_data_1 |>
  rename (unitid = x1, instnm = x2, semester = x3, quarter = x4, year = x5, Y = x6)
#head(semester_data_1)
##convert the character into double (numeric)
##then, delete the first row of the "semester_data_1"

semester_data_1 <- semester_data_1 |>
  mutate(unitid = as.numeric(unitid), semester = as.numeric(semester), quarter = as.numeric(quarter), year = as.numeric(year), Y = as.numeric(Y)) |>
  slice(-1)
#head(semester_data_1)
##rename the column names in "semester_data_2"
semester_data_2 <- semester_data_2 |>
  rename (unitid = x1, instnm = x2, semester = x3, quarter = x4, year = x5, Y = x6)
#head(semester_data_2)
##join the two data sets with bind_rows() because they have exactly the same columns
semester_data <- bind_rows(semester_data_1, semester_data_2)
#head(semester_data)
##delete the column "Y"
semester_data <- semester_data |>
  select (-Y) #|>
  ##mutate(instnm = toupper(instnm))
#head(semester_data)
##identify the transitioned year for each university
transition_year <- semester_data |>
  #group_by(unitid) |>
  mutate(prev_semester = lag(semester),
         prev_quarter = lag(quarter),
         .by = year)  |>
  filter(prev_semester == 0 & semester == 1 & prev_quarter == 1 & quarter == 0) |>
  rename(yearofsem = year) |>
  select (unitid, yearofsem) #should not add "instnm" here as upper cases and lower cases are mixed
#head(transition_year)
##use left_join to merge the two data sets ("semester_data" and "transition_year")
semester_dummy <- semester_data |>
  left_join (transition_year, by = "unitid") #joined only by "unitid"
#head(semester_dummy)
##finalizing the cleaning of the semester data
##create a dummy variable which shows 0 if semesters were yet to be introduced and 1 if already introduced
cleaned_semester_dummy <- semester_dummy |>
  #group_by(unitid) |>
  mutate(after = if_else(year < yearofsem, "0", "1"),
         .by = year) |>
  mutate(after = as.numeric(after))
cleaned_semester_dummy
##checking the number of NAs in "cleaned_semester_dummy"
#sum(is.na(cleaned_semester_dummy)) #total number of NAs: 25688

##the semester data set provided by the organizer
#semester_by_organizer <- read_csv("intermediate_by_organizer/clean_semester_dummy.csv") |>
  #select(-1)
#s#emester_by_organizer
#checking the number of NAs in "semester_by_organizer"
#sum(is.na(semester_by_organizer)) #identical with "cleaned_semester_dummy"
##save this cleaned data set "cleaned_semester_dummy" as a .csv file
##remember to add ".csv" at the end
#write_csv(cleaned_semester_dummy, "cleaned_semester_aomi.csv")

Gradrate Data (Outcome)

##use the intermediate data for the tentative use

##create empty tibble
df_gradrate <- tibble()

##loop through each year from 1991 to 2016
for (i in c(c(1991:1993), c(1995:2016))) { #excluding 1994
  #construct the filename
  filename <- paste0("raw/outcome_csv_aomi/", i, ".csv")
  gradrate_data <- read_csv(filename)
  df_gradrate <- bind_rows(df_gradrate, gradrate_data)  #repeat binding "df_gradrate" and "gradrate_data"
}
head(df_gradrate)
##calculate female's 4-year graduation rate
df_gradrate <- df_gradrate |>
  mutate(women_gradrate_4yr = as.numeric(women_gradrate_4yr)) |>
  mutate(womengradrate4yr = 0.01 * women_gradrate_4yr)
#head(df_gradrate)
#calculate total 4-year graduation rate, then covert into numeric factors
#calculate male's 4-year graduation rate, then covert into numeric factors
df_gradrate <- df_gradrate |>
  mutate(gradrate4yr = tot4yrgrads / totcohortsize,
         mengradrate4yr = m_4yrgrads / m_cohortsize)
#head(df_gradrate)
##use round() to control the number of digit
df_gradrate <- df_gradrate |>
  mutate(gradrate4yr = round(gradrate4yr, digits = 3)) |>
  mutate(mengradrate4yr = round(mengradrate4yr, digits = 3))
#head(df_gradrate)
##limit the range of years in the data set
outcome_data <- df_gradrate |>
  filter (year >= 1991 & year <= 2010)
outcome_data
#write_csv(outcome_data, "cleaned_outcome_aomi.csv")

Covariates Data

##load the data
covariates <- read_csv("raw/covariates_csv_aomi/covariates.csv")
#head(covariates)
##rename the column name from "university_id" into "unitid"
##then, create a new column with the same name while removing "aaaa" from each row
covariates <- covariates |>
  rename(unitid = university_id) |>
  mutate(unitid = str_remove_all(unitid, "aaaa"))
#head(covariates)
##see all the unique values in the "category" column
#unique(covariates$category)

##use "pivot_wider" to transfer the dataset
covariates <- covariates |>
  pivot_wider(names_from = category,
              values_from = value)
#head(covariates)
##recall "cleaned_semester_dummy"
#cleaned_semester_dummy
##recall "outcome_data"
#outcome_data
##see all the unique values existing in the "year" column of "cleaned_semester_dummy" data
unique(cleaned_semester_dummy$year)
##  [1] 1991 1992 1993 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010
##see all the unique values existing in the "year" column of "outcome_data"
unique(outcome_data$year)
##  [1] 1991 1992 1993 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010
##limit the year duration for "covariates" data
covariates <- covariates |>
  group_by(unitid) |>
  filter (year >= 1991 & year != 1994 & year <= 2010)
#head(covariates)
##see all the unique values existing in the "unitid" column in the "outcome_data" data
#unique(outcome_data$unitid)
##see all the unique values existing in the "unitid" column in the "covariates" data
#unique(covariates$unitid)

##filter the "unitid" column to keep only the rows that have the same value in "outcome_data" data set.

#pipe operators cannot be used within functions
#"unitid == outcome_data$unitid": This checks if unitid is equal to the entire vector outcome$unitid, which is not what you want if outcome$unitid contains multiple values. It would result in a logical vector of TRUE/FALSE that doesn't directly help in filtering rows.
#"unitid %in% outcome_data$unitid": This checks if each value in unitid is present in the outcome$unitid vector. It creates a logical vector where each element is TRUE if the corresponding unitid value is found in outcome$unitid and FALSE otherwise. This is useful for filtering rows based on the presence of values in a list or vector.

covariates_data <- covariates |>
  filter(unitid %in% (outcome_data$unitid)) |>
  #convert characters into double ("unitid" column)
  mutate(unitid = as.numeric(unitid))
#head(covariates_data)
#write_csv(covariates_data, "cleaned_covariates_aomi.csv")

Master Data

##Semester Data: cleaned_semester_dummy
##Gradrate Data: outcome_data
##Covariates Data: covariates_data
#cleaned_semester_dummy
#outcome_data
#covariates_data
##combine three data sets by "unitid (double)"
pre_master_data <- cleaned_semester_dummy |>
  left_join(outcome_data, by = c("unitid", "year"))
#pre_master_data

master_data <- pre_master_data |>
  left_join(covariates_data, by = c("unitid", "year")) |>
  #calculate the ratio of women cohorts and white cohorts, respectively
  mutate(per_white_cohort = white_cohortsize / totcohortsize,
         per_women_cohort = w_cohortsize / totcohortsize)
master_data
### Compare my cleaned data and data by organizer (covariates data) : identical!
##mine
#sum(is.na(covariates_data)) #total number of NAs: 0

##by organizer
#covariates_by_organizer <- read_csv("intermediate_by_organizer/clean_covariates.csv")
#sum(is.na(covariates_by_organizer)) #total number of NAs: 0
#write_csv(master_data, "master_data_aomi.csv")

Analysis

##count the total number of NAs in "master_data"
#master_data
#sum(is.na(master_data)) #total number of NAs: 25801

##load the cleaned master data provided by the organizers
#Master <- read_csv("intermediate_by_organizer/master.csv")
##count the number of NAs in each column in "master_data"
master_data |>
  apply(2, function(x) sum(is.na(x))) #1 for row, 2 for column
##             unitid             instnm           semester            quarter 
##                  0                  0                  0                  0 
##               year          yearofsem              after      totcohortsize 
##                  0              12730              12730                  0 
##       w_cohortsize       m_cohortsize        tot4yrgrads         m_4yrgrads 
##                  0                  0                  0                  0 
##         w_4yrgrads women_gradrate_4yr   womengradrate4yr        gradrate4yr 
##                  0                138                138                  0 
##     mengradrate4yr     instatetuition              costs            faculty 
##                 65                  0                  0                  0 
##   white_cohortsize   per_white_cohort   per_women_cohort 
##                  0                  0                  0

Replicate Table 1

##switchers: TRUE / never-switchers: FALSE
#abs(): absolute value

transition <- master_data |>
  group_by(unitid) |>
  mutate(semester_change = abs(semester - lag(semester, default = first(semester))), 
         quarter_change = abs(quarter - lag(quarter, default = first(quarter))),
         switching = (semester_change + quarter_change) > 0)  |>
  filter(switching == TRUE) |>
  select(unitid, switching)

##table: dataset containing the column called "switching"
table <- master_data |>
  left_join(transition, by = "unitid") |>
  mutate(switching = if_else(is.na(switching),"Never-Switchers", "Switchers"))
head(table)
##compare the graduation rate between semesters and non-semesters
master_data |>
  summarize(
    mean = mean(gradrate4yr, na.rm = TRUE),
    sd = sd(gradrate4yr, na.rm = TRUE),
    .by = semester
  ) |>
  knitr::kable(col.names = c("Semester", "Mean_Gradiation_Rate", "Standard_Deviation"))
Semester Mean_Gradiation_Rate Standard_Deviation
1 0.3765025 0.2262895
0 0.3041520 0.2246006

Replicate Figure 1: 4-year Graduation Rate

library(ggplot2)

avg_gradrate4yr <- master_data |>
  mutate(avg_gradrate4yr = mean(gradrate4yr),
         .by = year)

avg_gradrate4yr |>
  ggplot(mapping = aes(x = year,
                       y = avg_gradrate4yr)) +
  geom_line() +
  xlim(1990, 2010) + #set the minimum and maximum limits on the x-axis
  ylim(0.25, 0.45) +
  labs(x = "Year",
       y = "4-year Graduation Rate")

Replicate Figure 1: fraction of schools on semesters

semesters_share <- master_data |>
  mutate(semesters_share = sum(semester) / n(),
         .by = year)

semesters_share |>
  ggplot(mapping = aes(x = year, 
                       y = semesters_share)) +
  geom_line() +
  xlim(1990, 2010) + 
  ylim(0.8, 1.0) +
  labs (x = "Year",
        y = "Fractions of Schools on Semesters") 

###Scatter Plot
#colors()

##if creating the scatter plots manually:
#master_data |>
  #ggplot(mapping = aes(x = per_women_cohort, y = gradrate4yr)) +
  #geom_point(color = "skyblue", alpha = 0.5)

#master_data |>
  #ggplot(mapping = aes(x = per_white_cohort, y = gradrate4yr)) +
  #geom_point(color = "violet", alpha = 0.5)

#master_data |>
  #ggplot(mapping = aes(x = instatetuition, y = gradrate4yr)) +
  #geom_point(color = "springgreen", alpha = 0.5)

##if using rlang package
library(rlang)
library(ggplot2)

plot_gradrate <- function(data, xvar, color_value, xaxis) {
  xvar <- enquo(xvar) #quosing
  
  ggplot(data, mapping = aes(x = !!xvar, y = gradrate4yr)) + #dynamic
    geom_point(color = color_value, alpha = 0.5) + #static
    labs(x = xaxis, #static
       y = "4-year Graduation Rate")
}

plot_gradrate(master_data, per_women_cohort, "skyblue", "Female Student Shares")

plot_gradrate(master_data, per_white_cohort, "violet", "White Student Shares")

plot_gradrate(master_data, instatetuition, "springgreen", "Tuition")

### Regression Analysis

##Independent Variable: semester
##Dependent Variable: gradrate4yr
library(modelsummary)

##run linear regression
model <- lm(gradrate4yr ~ semester, data = master_data)
summary(model)
## 
## Call:
## lm(formula = gradrate4yr ~ semester, data = master_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37550 -0.20150 -0.02815  0.17350  0.60585 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.304152   0.006591   46.15   <2e-16 ***
## semester    0.072351   0.006752   10.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2262 on 25002 degrees of freedom
## Multiple R-squared:  0.004572,   Adjusted R-squared:  0.004532 
## F-statistic: 114.8 on 1 and 25002 DF,  p-value: < 2.2e-16
##create and display tables
modelsummary(model)
tinytable_fevljo53w91eq7q308zj
(1)
(Intercept) 0.304
(0.007)
semester 0.072
(0.007)
Num.Obs. 25004
R2 0.005
R2 Adj. 0.005
AIC -3364.1
BIC -3339.8
Log.Lik. 1685.068
RMSE 0.23